In [1]:
import os
import pandas as pd
import numpy as np
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings
warnings.filterwarnings('ignore')
from lightgbm import LGBMRegressor
import joblib
In [2]:
import numpy as np                   # array, vector, matrix calculations
import pandas as pd                  # DataFrame handling
import shap                          # for consistent, signed variable importance measurements
import xgboost as xgb                # gradient boosting machines (GBMs)

import matplotlib.pyplot as plt      # plotting
pd.options.display.max_columns = 999 # enable display of all columns in notebook

# enables display of plots in notebook
%matplotlib inline

np.random.seed(12345)    
In [3]:
sales = pd.read_csv('sales_train_evaluation.csv')
sales.name = 'sales'
calendar = pd.read_csv('calendar.csv')
calendar.name = 'calendar'
prices = pd.read_csv('sell_prices.csv')
prices.name = 'prices'
In [4]:
for d in range(1942,1970):
    col = 'd_' + str(d)
    sales[col] = 0
    sales[col] = sales[col].astype(np.int16)  #add sales infoemation from d1942 to d1969 which set as 0
In [5]:
sales_bd = np.round(sales.memory_usage().sum()/(1024*1024),1)
calendar_bd = np.round(calendar.memory_usage().sum()/(1024*1024),1)
prices_bd = np.round(prices.memory_usage().sum()/(1024*1024),1)
In [6]:
#Downcast in order to save memory
def downcast(df):
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    for i,t in enumerate(types):
        if 'int' in str(t):
            if df[cols[i]].min() > np.iinfo(np.int8).min and df[cols[i]].max() < np.iinfo(np.int8).max:
                df[cols[i]] = df[cols[i]].astype(np.int8)
            elif df[cols[i]].min() > np.iinfo(np.int16).min and df[cols[i]].max() < np.iinfo(np.int16).max:
                df[cols[i]] = df[cols[i]].astype(np.int16)
            elif df[cols[i]].min() > np.iinfo(np.int32).min and df[cols[i]].max() < np.iinfo(np.int32).max:
                df[cols[i]] = df[cols[i]].astype(np.int32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.int64)
        elif 'float' in str(t):
            if df[cols[i]].min() > np.finfo(np.float16).min and df[cols[i]].max() < np.finfo(np.float16).max:
                df[cols[i]] = df[cols[i]].astype(np.float16)
            elif df[cols[i]].min() > np.finfo(np.float32).min and df[cols[i]].max() < np.finfo(np.float32).max:
                df[cols[i]] = df[cols[i]].astype(np.float32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.float64)
        elif t == np.object:
            if cols[i] == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]], format='%Y-%m-%d')
            else:
                df[cols[i]] = df[cols[i]].astype('category')
    return df  

sales = downcast(sales)
prices = downcast(prices)
calendar = downcast(calendar)
In [7]:
sales_ad = np.round(sales.memory_usage().sum()/(1024*1024),1)
calendar_ad = np.round(calendar.memory_usage().sum()/(1024*1024),1)
prices_ad = np.round(prices.memory_usage().sum()/(1024*1024),1)
In [8]:
dic = {'DataFrame':['sales','calendar','prices'],
       'Before downcasting':[sales_bd,calendar_bd,prices_bd],
       'After downcasting':[sales_ad,calendar_ad,prices_ad]}

memory = pd.DataFrame(dic)
memory = pd.melt(memory, id_vars='DataFrame', var_name='Status', value_name='Memory (MB)')
memory.sort_values('Memory (MB)',inplace=True)
fig = px.bar(memory, x='DataFrame', y='Memory (MB)', color='Status', barmode='group', text='Memory (MB)')
fig.update_traces(texttemplate='%{text} MB', textposition='outside')
fig.update_layout(template='seaborn', title='Effect of Downcasting')
fig.show()
In [9]:
df = pd.melt(sales, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='d', value_name='sold').dropna()
In [10]:
df = pd.merge(df, calendar, on='d', how='left')
df = pd.merge(df, prices, on=['store_id','item_id','wm_yr_wk'], how='left') 
In [11]:
#remove the information of 12-25 of each year
df=df[-(df['date']== "2011-12-25")&-(df['date']== "2012-12-25")&-(df['date']== "2013-12-25")&-(df['date']== "2014-12-25")&-(df['date']== "2015-12-25")]
In [12]:
group = sales.groupby(['state_id','store_id','cat_id','dept_id'],as_index=False)['item_id'].count().dropna()
group['USA'] = 'United States of America'
group.rename(columns={'state_id':'State','store_id':'Store','cat_id':'Category','dept_id':'Department','item_id':'Count'},inplace=True)
In [13]:
group_price_store = df.groupby(['state_id','store_id','item_id'],as_index=False)['sell_price'].mean().dropna()
In [14]:
group_price_cat = df.groupby(['store_id','cat_id','item_id'],as_index=False)['sell_price'].mean().dropna()
In [15]:
group = df.groupby(['year','date','state_id','store_id'], as_index=False)['sold'].sum().dropna()
In [16]:
fig = go.Figure()
title = 'Items sold over time'
years = group.year.unique().tolist()
buttons = []
y=3
for state in group.state_id.unique().tolist():
    group_state = group[group['state_id']==state]
    for store in group_state.store_id.unique().tolist():
        group_state_store = group_state[group_state['store_id']==store]
        fig.add_trace(go.Scatter(name=store, x=group_state_store['date'], y=group_state_store['sold'], showlegend=True, 
                                   yaxis='y'+str(y) if y!=1 else 'y'))
    y-=1

fig.update_layout(
        xaxis=dict(
        #autorange=True,
        range = ['2011-01-29','2016-05-22'],
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
                dict(count=1,
                     label="YTD",
                     step="year",
                     stepmode="todate"),
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(count=2,
                     label="2y",
                     step="year",
                     stepmode="backward"),
                dict(count=3,
                     label="3y",
                     step="year",
                     stepmode="backward"),
                dict(count=4,
                     label="4y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            autorange=True,
        ),
        type="date"
    ),
    yaxis=dict(
        anchor="x",
        autorange=True,
        domain=[0, 0.33],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks="",
        title='WI',
        titlefont={"size":20},
        type="linear",
        zeroline=False
    ),
    yaxis2=dict(
        anchor="x",
        autorange=True,
        domain=[0.33, 0.66],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks="",
        title = 'TX',
        titlefont={"size":20},
        type="linear",
        zeroline=False
    ),
    yaxis3=dict(
        anchor="x",
        autorange=True,
        domain=[0.66, 1],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks='',
        title="CA",
        titlefont={"size":20},
        type="linear",
        zeroline=False
    )
    )
fig.update_layout(template='seaborn', title=title)
fig.show()
In [17]:
df['revenue'] = df['sold']*df['sell_price'].astype(np.float32)
In [18]:
def introduce_nulls(df):
    idx = pd.date_range(df.date.dt.date.min(), df.date.dt.date.max())
    df = df.set_index('date')
    df = df.reindex(idx)
    df.reset_index(inplace=True)
    df.rename(columns={'index':'date'},inplace=True)
    return df

def plot_metric(df,state,store,metric):
    store_sales = df[(df['state_id']==state)&(df['store_id']==store)&(df['date']<='2016-05-22')]
    food_sales = store_sales[store_sales['cat_id']=='FOODS']
    store_sales = store_sales.groupby(['date','snap_'+state],as_index=False)['sold','revenue'].sum()
    snap_sales = store_sales[store_sales['snap_'+state]==1]
    non_snap_sales = store_sales[store_sales['snap_'+state]==0]
    food_sales = food_sales.groupby(['date','snap_'+state],as_index=False)['sold','revenue'].sum()
    snap_foods = food_sales[food_sales['snap_'+state]==1]
    non_snap_foods = food_sales[food_sales['snap_'+state]==0]
    non_snap_sales = introduce_nulls(non_snap_sales)
    snap_sales = introduce_nulls(snap_sales)
    non_snap_foods = introduce_nulls(non_snap_foods)
    snap_foods = introduce_nulls(snap_foods)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=non_snap_sales['date'],y=non_snap_sales[metric],
                           name='Total '+metric+'(Non-SNAP)'))
    fig.add_trace(go.Scatter(x=snap_sales['date'],y=snap_sales[metric],
                           name='Total '+metric+'(SNAP)'))
    fig.add_trace(go.Scatter(x=non_snap_foods['date'],y=non_snap_foods[metric],
                           name='Food '+metric+'(Non-SNAP)'))
    fig.add_trace(go.Scatter(x=snap_foods['date'],y=snap_foods[metric],
                           name='Food '+metric+'(SNAP)'))
    fig.update_yaxes(title_text='Total items sold' if metric=='sold' else 'Total revenue($)')
    fig.update_layout(template='seaborn',title=store)
    fig.update_layout(
        xaxis=dict(
        #autorange=True,
        range = ['2011-01-29','2016-05-22'],
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
                dict(count=1,
                     label="YTD",
                     step="year",
                     stepmode="todate"),
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(count=2,
                     label="2y",
                     step="year",
                     stepmode="backward"),
                dict(count=3,
                     label="3y",
                     step="year",
                     stepmode="backward"),
                dict(count=4,
                     label="4y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            autorange=True,
        ),
        type="date"
    ))
    return fig
In [19]:
cal_data = group.copy()
cal_data = cal_data[cal_data.date <= '22-05-2016']
cal_data['week'] = cal_data.date.dt.weekofyear
cal_data['day_name'] = cal_data.date.dt.day_name()
In [20]:
def calmap(cal_data, state, store, scale):
    cal_data = cal_data[(cal_data['state_id']==state)&(cal_data['store_id']==store)]
    years = cal_data.year.unique().tolist()
    fig = make_subplots(rows=len(years),cols=1,shared_xaxes=True,vertical_spacing=0.005)
    r=1
    for year in years:
        data = cal_data[cal_data['year']==year]
        data = introduce_nulls(data)
        fig.add_trace(go.Heatmap(
            z=data.sold,
            x=data.week,
            y=data.day_name,
            hovertext=data.date.dt.date,
            coloraxis = "coloraxis",name=year,
        ),r,1)
        fig.update_yaxes(title_text=year,tickfont=dict(size=5),row = r,col = 1)
        r+=1
    fig.update_xaxes(range=[1,53],tickfont=dict(size=10), nticks=53)
    fig.update_layout(coloraxis = {'colorscale':scale})
    fig.update_layout(template='seaborn', title=store)
    return fig
In [21]:
d_id = dict(zip(df.id.cat.codes, df.id))
d_item_id = dict(zip(df.item_id.cat.codes, df.item_id))
d_dept_id = dict(zip(df.dept_id.cat.codes, df.dept_id))
d_cat_id = dict(zip(df.cat_id.cat.codes, df.cat_id))
d_store_id = dict(zip(df.store_id.cat.codes, df.store_id))
d_state_id = dict(zip(df.state_id.cat.codes, df.state_id))
In [22]:
del group, group_price_cat, group_price_store, group_state, group_state_store, cal_data
gc.collect();

#2
df.d = df['d'].apply(lambda x: x.split('_')[1]).astype(np.int16)
cols = df.dtypes.index.tolist()
types = df.dtypes.values.tolist()
for i,type in enumerate(types):
    if type.name == 'category':
        df[cols[i]] = df[cols[i]].cat.codes
        
#3
df.drop('date',axis=1,inplace=True)
In [23]:
lags = [1,2,3,6,12,24,36]
for lag in lags:
    df['sold_lag_'+str(lag)] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],as_index=False)['sold'].shift(lag).astype(np.float16)
In [24]:
df['iteam_sold_avg'] = df.groupby('item_id')['sold'].transform('mean').astype(np.float16)
df['state_sold_avg'] = df.groupby('state_id')['sold'].transform('mean').astype(np.float16)
df['store_sold_avg'] = df.groupby('store_id')['sold'].transform('mean').astype(np.float16)
df['cat_sold_avg'] = df.groupby('cat_id')['sold'].transform('mean').astype(np.float16)
df['dept_sold_avg'] = df.groupby('dept_id')['sold'].transform('mean').astype(np.float16)
df['cat_dept_sold_avg'] = df.groupby(['cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
df['store_item_sold_avg'] = df.groupby(['store_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['cat_item_sold_avg'] = df.groupby(['cat_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['dept_item_sold_avg'] = df.groupby(['dept_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_sold_avg'] = df.groupby(['state_id','store_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_cat_sold_avg'] = df.groupby(['state_id','store_id','cat_id'])['sold'].transform('mean').astype(np.float16)
df['store_cat_dept_sold_avg'] = df.groupby(['store_id','cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
In [25]:
df['rolling_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.rolling(window=7).mean()).astype(np.float16)
In [26]:
df['expanding_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.expanding(2).mean()).astype(np.float16)
In [27]:
df['daily_avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id','d'])['sold'].transform('mean').astype(np.float16)
df['avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform('mean').astype(np.float16)
df['selling_trend'] = (df['daily_avg_sold'] - df['avg_sold']).astype(np.float16)
df.drop(['daily_avg_sold','avg_sold'],axis=1,inplace=True)
In [28]:
df = df[df['d']>=36]
In [29]:
df
Out[29]:
id item_id dept_id cat_id store_id state_id d sold wm_yr_wk weekday wday month year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price revenue sold_lag_1 sold_lag_2 sold_lag_3 sold_lag_6 sold_lag_12 sold_lag_24 sold_lag_36 iteam_sold_avg state_sold_avg store_sold_avg cat_sold_avg dept_sold_avg cat_dept_sold_avg store_item_sold_avg cat_item_sold_avg dept_item_sold_avg state_store_sold_avg state_store_cat_sold_avg store_cat_dept_sold_avg rolling_sold_mean expanding_sold_mean selling_trend
1067150 14370 1437 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.217041 1.218750 1.307617 0.562500 0.697754 0.697754 0.322266 0.217041 0.217041 1.307617 0.803711 1.022461 0.0 0.000000 -0.322266
1067151 14380 1438 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.260498 1.218750 1.307617 0.562500 0.697754 0.697754 0.254639 0.260498 0.260498 1.307617 0.803711 1.022461 0.0 0.000000 -0.254639
1067152 14390 1439 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.076904 1.218750 1.307617 0.562500 0.697754 0.697754 0.157349 0.076904 0.076904 1.307617 0.803711 1.022461 0.0 0.000000 -0.157349
1067153 14400 1440 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 4.339844 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 2.017578 1.218750 1.307617 0.562500 0.697754 0.697754 1.699219 2.017578 2.017578 1.307617 0.803711 1.022461 0.0 0.000000 -1.699219
1067154 14410 1441 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.757324 1.218750 1.307617 0.562500 0.697754 0.697754 0.961426 0.757324 0.757324 1.307617 0.803711 1.022461 0.0 0.000000 -0.961426
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60034805 14329 1432 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.799805 1.030273 1.092773 1.626953 2.037109 2.037109 0.529297 0.799805 0.799805 1.092773 1.706055 2.214844 0.0 0.529297 -0.529297
60034806 14339 1433 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.480469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.431396 1.030273 1.092773 1.626953 2.037109 2.037109 0.370605 0.431396 0.431396 1.092773 1.706055 2.214844 0.0 0.370605 -0.370605
60034807 14349 1434 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 3.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.706055 1.030273 1.092773 1.626953 2.037109 2.037109 0.882812 0.706055 0.706055 1.092773 1.706055 2.214844 0.0 0.882812 -0.882812
60034808 14359 1435 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.280273 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.638672 1.030273 1.092773 1.626953 2.037109 2.037109 0.376221 0.638672 0.638672 1.092773 1.706055 2.214844 0.0 0.376221 -0.376221
60034809 14369 1436 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.618164 1.030273 1.092773 1.626953 2.037109 2.037109 0.694336 0.618164 0.618164 1.092773 1.706055 2.214844 0.0 0.694336 -0.694336

58815210 rows × 44 columns

In [30]:
df.to_pickle('data.pkl')
del df
gc.collect();
In [31]:
df = pd.read_pickle('data.pkl')
In [32]:
df
Out[32]:
id item_id dept_id cat_id store_id state_id d sold wm_yr_wk weekday wday month year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price revenue sold_lag_1 sold_lag_2 sold_lag_3 sold_lag_6 sold_lag_12 sold_lag_24 sold_lag_36 iteam_sold_avg state_sold_avg store_sold_avg cat_sold_avg dept_sold_avg cat_dept_sold_avg store_item_sold_avg cat_item_sold_avg dept_item_sold_avg state_store_sold_avg state_store_cat_sold_avg store_cat_dept_sold_avg rolling_sold_mean expanding_sold_mean selling_trend
1067150 14370 1437 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.217041 1.218750 1.307617 0.562500 0.697754 0.697754 0.322266 0.217041 0.217041 1.307617 0.803711 1.022461 0.0 0.000000 -0.322266
1067151 14380 1438 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.260498 1.218750 1.307617 0.562500 0.697754 0.697754 0.254639 0.260498 0.260498 1.307617 0.803711 1.022461 0.0 0.000000 -0.254639
1067152 14390 1439 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.076904 1.218750 1.307617 0.562500 0.697754 0.697754 0.157349 0.076904 0.076904 1.307617 0.803711 1.022461 0.0 0.000000 -0.157349
1067153 14400 1440 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 4.339844 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 2.017578 1.218750 1.307617 0.562500 0.697754 0.697754 1.699219 2.017578 2.017578 1.307617 0.803711 1.022461 0.0 0.000000 -1.699219
1067154 14410 1441 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.757324 1.218750 1.307617 0.562500 0.697754 0.697754 0.961426 0.757324 0.757324 1.307617 0.803711 1.022461 0.0 0.000000 -0.961426
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60034805 14329 1432 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.799805 1.030273 1.092773 1.626953 2.037109 2.037109 0.529297 0.799805 0.799805 1.092773 1.706055 2.214844 0.0 0.529297 -0.529297
60034806 14339 1433 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.480469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.431396 1.030273 1.092773 1.626953 2.037109 2.037109 0.370605 0.431396 0.431396 1.092773 1.706055 2.214844 0.0 0.370605 -0.370605
60034807 14349 1434 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 3.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.706055 1.030273 1.092773 1.626953 2.037109 2.037109 0.882812 0.706055 0.706055 1.092773 1.706055 2.214844 0.0 0.882812 -0.882812
60034808 14359 1435 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.280273 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.638672 1.030273 1.092773 1.626953 2.037109 2.037109 0.376221 0.638672 0.638672 1.092773 1.706055 2.214844 0.0 0.376221 -0.376221
60034809 14369 1436 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.618164 1.030273 1.092773 1.626953 2.037109 2.037109 0.694336 0.618164 0.618164 1.092773 1.706055 2.214844 0.0 0.694336 -0.694336

58815210 rows × 44 columns

In [33]:
#X_train, y_train = df[df['d']<1914].drop('sold',axis=1), df[df['d']<1914]['sold']
##X_valid, y_valid = df[(df['d']>=1914) & (df['d']<1942)].drop('sold',axis=1), df[(df['d']>=1914) & (df['d']<1942)]['sold']
#X_test = df[df['d']>=1942].drop('sold',axis=1)
In [34]:
data = df
valid1 = data[(data['d']>=1914) & (data['d']<1942)][['id','d','sold']]
test1 = data[data['d']>=1942][['id','d','sold']]
eval_preds = test1['sold']
valid_preds = valid1['sold']
In [35]:
df
Out[35]:
id item_id dept_id cat_id store_id state_id d sold wm_yr_wk weekday wday month year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price revenue sold_lag_1 sold_lag_2 sold_lag_3 sold_lag_6 sold_lag_12 sold_lag_24 sold_lag_36 iteam_sold_avg state_sold_avg store_sold_avg cat_sold_avg dept_sold_avg cat_dept_sold_avg store_item_sold_avg cat_item_sold_avg dept_item_sold_avg state_store_sold_avg state_store_cat_sold_avg store_cat_dept_sold_avg rolling_sold_mean expanding_sold_mean selling_trend
1067150 14370 1437 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.217041 1.218750 1.307617 0.562500 0.697754 0.697754 0.322266 0.217041 0.217041 1.307617 0.803711 1.022461 0.0 0.000000 -0.322266
1067151 14380 1438 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.260498 1.218750 1.307617 0.562500 0.697754 0.697754 0.254639 0.260498 0.260498 1.307617 0.803711 1.022461 0.0 0.000000 -0.254639
1067152 14390 1439 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.076904 1.218750 1.307617 0.562500 0.697754 0.697754 0.157349 0.076904 0.076904 1.307617 0.803711 1.022461 0.0 0.000000 -0.157349
1067153 14400 1440 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 4.339844 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 2.017578 1.218750 1.307617 0.562500 0.697754 0.697754 1.699219 2.017578 2.017578 1.307617 0.803711 1.022461 0.0 0.000000 -1.699219
1067154 14410 1441 3 1 0 0 36 0 11106 2 1 3 2011 -1 -1 -1 -1 1 1 1 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.757324 1.218750 1.307617 0.562500 0.697754 0.697754 0.961426 0.757324 0.757324 1.307617 0.803711 1.022461 0.0 0.000000 -0.961426
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60034805 14329 1432 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.799805 1.030273 1.092773 1.626953 2.037109 2.037109 0.529297 0.799805 0.799805 1.092773 1.706055 2.214844 0.0 0.529297 -0.529297
60034806 14339 1433 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 2.480469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.431396 1.030273 1.092773 1.626953 2.037109 2.037109 0.370605 0.431396 0.431396 1.092773 1.706055 2.214844 0.0 0.370605 -0.370605
60034807 14349 1434 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 3.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.706055 1.030273 1.092773 1.626953 2.037109 2.037109 0.882812 0.706055 0.706055 1.092773 1.706055 2.214844 0.0 0.882812 -0.882812
60034808 14359 1435 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.280273 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.638672 1.030273 1.092773 1.626953 2.037109 2.037109 0.376221 0.638672 0.638672 1.092773 1.706055 2.214844 0.0 0.376221 -0.376221
60034809 14369 1436 2 0 9 2 1969 0 11621 3 2 6 2016 16 3 2 0 0 0 0 1.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.618164 1.030273 1.092773 1.626953 2.037109 2.037109 0.694336 0.618164 0.618164 1.092773 1.706055 2.214844 0.0 0.694336 -0.694336

58815210 rows × 44 columns

In [36]:
y = 'sold'
X = [name for name in df.columns if name not in [y,'id']]
print('y =', y)
print('X =', X)
y = sold
X = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'revenue', 'sold_lag_1', 'sold_lag_2', 'sold_lag_3', 'sold_lag_6', 'sold_lag_12', 'sold_lag_24', 'sold_lag_36', 'iteam_sold_avg', 'state_sold_avg', 'store_sold_avg', 'cat_sold_avg', 'dept_sold_avg', 'cat_dept_sold_avg', 'store_item_sold_avg', 'cat_item_sold_avg', 'dept_item_sold_avg', 'state_store_sold_avg', 'state_store_cat_sold_avg', 'store_cat_dept_sold_avg', 'rolling_sold_mean', 'expanding_sold_mean', 'selling_trend']
In [37]:
stores = sales.store_id.cat.codes.unique().tolist()
for store in stores:
    df = data[data['store_id']==store]
    
    #Split the data
    #X_train, y_train = df[df['d']<1914].drop('sold',axis=1), df[df['d']<1914]['sold']
    #X_valid, y_valid = df[(df['d']>=1914) & (df['d']<1942)].drop('sold',axis=1), df[(df['d']>=1914) & (df['d']<1942)]['sold']
    #X_test = df[df['d']>=1942].drop('sold',axis=1)
    
    
    
    train = df[df['d']<1914]
    valid = df[(df['d']>=1914) & (df['d']<1942)]
    test=df[df['d']>=1942]
    
    
    dtrain = xgb.DMatrix(train[X], train[y])
    dtest = xgb.DMatrix(test[X], test[y])
    dvaild=xgb.DMatrix(valid[X], valid[y])
    #Train and validate
    #model = LGBMRegressor(
        #n_estimators=1000,
        #learning_rate=0.3,
        #subsample=0.8,
        #colsample_bytree=0.8,
        #max_depth=8,
        #num_leaves=50,
        #min_child_weight=300
    #)
    params = {
    #'objective': 'binary:logistic',             # produces 0-1 probabilities for binary classification
        'booster': 'gbtree',                        # base learner will be decision tree
        'eval_metric': 'rmse',                       # stop training based on maximum AUC, AUC always between 0-1
        'eta': 0.08,                                # learning rate
        'subsample': 0.8,                           # use 90% of rows in each decision tree
        'colsample_bytree': 0.8,                    # use 90% of columns in each decision tree
        'max_depth': 8,                            # allow decision trees to grow to depth of 15
    #'monotone_constraints':mono_constraints,    # 1 = increasing relationship, -1 = decreasing relationship
    #'base_score': base_y,                       # calibrate predictions to mean of y 
        'seed': 12345                               # set random seed for reproducibility
    }
    watchlist = [(dtrain, 'train'), (dvaild, 'eval')]
    xgb_model = xgb.train(params,                   # set tuning parameters from above                   
                          dtrain,                   # training data
                          1000,                     # maximum of 1000 iterations (trees)
                          evals=watchlist,          # use watchlist for early stopping 
                          early_stopping_rounds=15, # stop after 50 iterations (trees) without increase in AUC
                          verbose_eval=5)    
    
    
    
    
    
    
    #print('*****Prediction for Store: {}*****'.format(d_store_id[store]))
    #model.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_valid,y_valid)],
            # eval_metric='rmse', verbose=20, early_stopping_rounds=20)
        
        
        
    eval_preds[test.index]= xgb_model.predict(dtest)
    valid_preds[valid.index] = xgb_model.predict(dvaild)
    #valid_preds[X_valid.index] = model.predict(X_valid)
    #eval_preds[X_test.index] = model.predict(X_test)
    shap_values = xgb_model.predict(dvalid, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
    shap.summary_plot(shap_values[:, :-1], vaild[xgb_model.feature_names])
    
    
    
    filename = 'xgbmodel'+str(d_store_id[store])+'.pkl'
    # save model
    joblib.dump(xgb_model, filename)
    del xgb_model, train, valid, test #, y_valid
    gc.collect()
[23:07:40] WARNING: src/learner.cc:686: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	train-rmse:3.83537	eval-rmse:3.47593
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 15 rounds.
[5]	train-rmse:2.6492	eval-rmse:2.36318
[10]	train-rmse:1.9191	eval-rmse:1.68243
[15]	train-rmse:1.43383	eval-rmse:1.20662
[20]	train-rmse:1.13586	eval-rmse:0.917852
[25]	train-rmse:0.965272	eval-rmse:0.755389
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-37-548e932f933b> in <module>
     46                           evals=watchlist,          # use watchlist for early stopping
     47                           early_stopping_rounds=15, # stop after 50 iterations (trees) without increase in AUC
---> 48                           verbose_eval=5)    
     49 
     50 

~/opt/anaconda3/lib/python3.7/site-packages/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, learning_rates)
    214                            evals=evals,
    215                            obj=obj, feval=feval,
--> 216                            xgb_model=xgb_model, callbacks=callbacks)
    217 
    218 

~/opt/anaconda3/lib/python3.7/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     72         # Skip the first update if it is a recovery step.
     73         if version % 2 == 0:
---> 74             bst.update(dtrain, i, obj)
     75             bst.save_rabit_checkpoint()
     76             version += 1

~/opt/anaconda3/lib/python3.7/site-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
   1107         if fobj is None:
   1108             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, ctypes.c_int(iteration),
-> 1109                                                     dtrain.handle))
   1110         else:
   1111             pred = self.predict(dtrain)

KeyboardInterrupt: 
In [37]:
actual = False
if actual == False:
    #Get the validation results(We already have them as less than one month left for competition to end)
    validation = sales[['id']+['d_' + str(i) for i in range(1914,1942)]]
    validation['id']=pd.read_csv('sales_train_validation.csv').id
    validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
else:
    #Get the actual validation results
    valid['sold'] = valid_preds
    validation = valid[['id','d','sold']]
    validation = pd.pivot(validation, index='id', columns='d', values='sold').reset_index()
    validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
    validation.id = validation.id.map(d_id).str.replace('evaluation','validation')
In [40]:
test['sold'] = eval_preds
evaluation = test[['id','d','sold']]
evaluation = pd.pivot(evaluation, index='id', columns='d', values='sold').reset_index()
evaluation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
#Remap the category id to their respective categories
evaluation.id = evaluation.id.map(d_id)

#Prepare the submission
submit = pd.concat([validation,evaluation]).reset_index(drop=True)
submit.to_csv('xgboostsub.csv',index=False)
In [42]:
subxg=pd.read_csv('xgboostsub.csv')
subxg
Out[42]:
id F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28
0 HOBBIES_1_001_CA_1_validation 0 0 0 2 0 3 5 0 0 1 1 0 2 1 2 2 1 0 2 4 0 0 0 0 3 3 0 1
1 HOBBIES_1_002_CA_1_validation 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 2 1 1 0 0 0 0 0
2 HOBBIES_1_003_CA_1_validation 0 0 1 1 0 2 1 0 0 0 0 2 1 3 0 0 1 0 1 0 2 0 0 0 2 3 0 1
3 HOBBIES_1_004_CA_1_validation 0 0 1 2 4 1 6 4 0 0 0 2 2 4 2 1 1 1 1 1 0 4 0 1 3 0 2 6
4 HOBBIES_1_005_CA_1_validation 1 0 2 3 1 0 3 2 3 1 1 3 2 3 2 2 2 2 0 0 0 2 1 0 0 2 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
33534 HOUSEHOLD_2_512_WI_3_evaluation 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
33535 HOUSEHOLD_2_513_WI_3_evaluation 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
33536 HOUSEHOLD_2_514_WI_3_evaluation 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
33537 HOUSEHOLD_2_515_WI_3_evaluation 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
33538 HOUSEHOLD_2_516_WI_3_evaluation 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

33539 rows × 29 columns

In [44]:
subsec=pd.read_csv('submissionsecondtime.csv')
subsec
Out[44]:
id F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28
0 HOBBIES_1_001_CA_1_validation 0.000000 0.000000 0.000000 2.000000 0.000000 3.000000 5.000000 0.000000 0.000000 1.000000 1.000000 0.000000 2.000000 1.000000 2.000000 2.000000 1.000000 0.000000 2.000000 4.000000 0.000000 0.000000 0.000000 0.000000 3.000000 3.000000 0.000000 1.000000
1 HOBBIES_1_002_CA_1_validation 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 2.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2 HOBBIES_1_003_CA_1_validation 0.000000 0.000000 1.000000 1.000000 0.000000 2.000000 1.000000 0.000000 0.000000 0.000000 0.000000 2.000000 1.000000 3.000000 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 2.000000 0.000000 0.000000 0.000000 2.000000 3.000000 0.000000 1.000000
3 HOBBIES_1_004_CA_1_validation 0.000000 0.000000 1.000000 2.000000 4.000000 1.000000 6.000000 4.000000 0.000000 0.000000 0.000000 2.000000 2.000000 4.000000 2.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 4.000000 0.000000 1.000000 3.000000 0.000000 2.000000 6.000000
4 HOBBIES_1_005_CA_1_validation 1.000000 0.000000 2.000000 3.000000 1.000000 0.000000 3.000000 2.000000 3.000000 1.000000 1.000000 3.000000 2.000000 3.000000 2.000000 2.000000 2.000000 2.000000 0.000000 0.000000 0.000000 2.000000 1.000000 0.000000 0.000000 2.000000 1.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60975 HOUSEHOLD_2_516_TX_2_evaluation -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594 -0.001594
60976 HOUSEHOLD_2_516_TX_3_evaluation -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290 -0.001290
60977 HOUSEHOLD_2_516_WI_1_evaluation -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847 -0.000847
60978 HOUSEHOLD_2_516_WI_2_evaluation -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194 -0.001194
60979 HOUSEHOLD_2_516_WI_3_evaluation -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174 -0.000174

60980 rows × 29 columns

In [ ]:
 
In [ ]: